Loading Packages and Libraries

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rvest)
library(corrplot)
## corrplot 0.90 loaded
library(ggplot2)
library(cluster)
library(fpc)
library(pvclust)
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ purrr   0.3.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand()         masks Matrix::expand()
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::map()            masks mclust::map()
## x tidyr::pack()           masks Matrix::pack()
## x tidyr::unpack()         masks Matrix::unpack()
library(bestglm)
## Loading required package: leaps
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(Rcpp)
library(reactable)

Importing Data

##### per game stats (FGA, 3PA, RB, AST, STL, BLK, TOV)
# Download and parse the NBA_2021 league page from Basketball Reference.
per_game_url <- "https://www.basketball-reference.com/leagues/NBA_2021.html"
per_game_page <- rvest::read_html(per_game_url)

# Keep the 7th <table> node on the page. Single-bracket indexing keeps a node
# set, so html_table() returns a one-element list (unpacked later with [[1]]).
# NOTE(review): the summary() output further down shows values such as FGA in
# the thousands, so table 7 may be the season-totals table rather than the
# per-game table -- confirm the index.
per_game_df <- rvest::html_table(
  rvest::html_nodes(per_game_page, "table")[7],
  fill = TRUE
)


#### advanced team stats
# The advanced table is on the same league page that was already downloaded
# and parsed into `per_game_page`, so reuse that document instead of
# requesting the identical URL a second time.
# Keep the 11th <table> node. Single-bracket indexing keeps a node set, so
# html_table() returns a one-element list (unpacked later with [[1]]).
adv_game_df <- rvest::html_table(
  rvest::html_nodes(per_game_page, "table")[11],
  fill = TRUE
)


#### advanced scoring stats
# Load the pbpstats scoring export, sort its rows by Name, and preview it.
# NOTE(review): the previews show GamesPlayed = 72 in this file but 383-393 in
# the other pbpstats exports, so the exports may cover different season
# ranges -- confirm before combining them.
scoring_adv <- read.csv(
  "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export.csv"
)
scoring_adv <- scoring_adv[order(scoring_adv[["Name"]]), , drop = FALSE]
head(scoring_adv)
##    Name GamesPlayed OffPoss   Points     FG2M     FG2A    Fg2Pct     FG3M
## 16  ATL          72    7086 113.6944 28.36111 53.87500 0.5264243 12.43056
## 3   BKN          72    7187 118.5694 28.97222 51.23611 0.5654649 14.16667
## 20  BOS          72    7094 112.6250 27.86111 52.54167 0.5302670 13.59722
## 23  CHA          72    7083 109.4583 26.25000 50.80556 0.5166758 13.68056
## 17  CHI          72    7136 110.6806 29.59722 54.63889 0.5416878 12.55556
## 7   CLE          72    7032 103.8333 28.58333 56.02778 0.5101636 10.00000
##        FG3A    Fg3Pct NonHeaveFg3Pct FtPoints PtsAssisted2s PtsUnassisted2s
## 16 33.36111 0.3726062      0.3744770 19.68056      28.58333        28.13889
## 3  36.11111 0.3923077      0.3941267 18.12500      31.05556        26.88889
## 20 36.36111 0.3739496      0.3766833 16.11111      26.63889        29.08333
## 23 37.02778 0.3694674      0.3713208 15.91667      30.94444        21.55556
## 17 33.97222 0.3695830      0.3714521 13.81944      32.47222        26.72222
## 7  29.73611 0.3362915      0.3372365 16.66667      30.86111        26.30556
##    PtsAssisted3s PtsUnassisted3s Assisted2sPct NonPutbacksAssisted2sPct
## 16      29.50000        7.791667     0.5039177                0.5635268
## 3       33.79167        8.708333     0.5359540                0.5783756
## 20      30.41667       10.375000     0.4780658                0.5272128
## 23      34.12500        6.916667     0.5894180                0.6300905
## 17      31.58333        6.083333     0.5485687                0.5952138
## 7       25.20833        4.791667     0.5398445                0.5884534
##    Assisted3sPct   FG3APct ShotQualityAvg    EfgPct     TsPct PtsPutbacks
## 16     0.7910615 0.3824232      0.5304958 0.5388473 0.5784286    6.000000
## 3      0.7950980 0.4134203      0.5354285 0.5749722 0.6088387    4.250000
## 20     0.7456588 0.4089986      0.5129291 0.5428058 0.5739742    5.194444
## 23     0.8314721 0.4215686      0.5330921 0.5324953 0.5632439    3.388889
## 17     0.8384956 0.3833856      0.5294977 0.5465517 0.5754854    4.638889
## 7      0.8402778 0.3467206      0.5345787 0.5081781 0.5416303    4.722222
##    Fg2aBlocked FG2APctBlocked Fg3aBlocked FG3APctBlocked
## 16    4.861111     0.09022944   0.1805556    0.005412157
## 3     4.347222     0.08484684   0.2361111    0.006538462
## 20    4.347222     0.08273857   0.2916667    0.008021390
## 23    4.652778     0.09158010   0.1666667    0.004501125
## 17    4.875000     0.08922217   0.1805556    0.005314800
## 7     5.597222     0.09990084   0.3055556    0.010275572
#### advanced assist stats
# Load the pbpstats assists export, sort its rows by Name, and preview it.
assist_adv <- read.csv(
  "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_assists.csv"
)
assist_adv <- assist_adv[order(assist_adv[["Name"]]), , drop = FALSE]
head(assist_adv)
##    Name GamesPlayed  Assists AssistPoints TwoPtAssists ThreePtAssists
## 14  ATL         385 24.27013     57.94805     14.86234       9.407792
## 21  BKN         390 23.95128     58.12821     13.72564      10.225641
## 12  BOS         390 24.12821     58.09744     14.28718       9.841026
## 29  CHA         383 23.62402     56.34987     14.52219       9.101828
## 3   CHI         383 23.51697     55.75196     14.79896       8.718016
## 1   CLE         383 22.68407     54.71279     13.33943       9.344648
##    AtRimAssists ShortMidRangeAssists LongMidRangeAssists Corner3Assists
## 14    10.228571             2.519481            2.114286       2.979221
## 21    10.138462             2.541026            1.046154       2.807692
## 12     8.897436             2.874359            2.515385       2.612821
## 29     9.490862             2.971279            2.060052       2.331593
## 3      9.104439             3.248042            2.446475       2.618799
## 1      9.161880             2.571802            1.605744       2.892950
##    Arc3Assists
## 14    6.428571
## 21    7.417949
## 12    7.228205
## 29    6.770235
## 3     6.099217
## 1     6.451697
#### shot distribution
# Load the pbpstats shot-distribution export, sort its rows by Name, and
# preview it.
shot_dist <- read.csv(
  "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_shotdist.csv"
)
shot_dist <- shot_dist[order(shot_dist[["Name"]]), , drop = FALSE]
head(shot_dist)
##    Name GamesPlayed ShotQualityAvg AtRimFG3AFrequency Avg2ptShotDistance
## 14  ATL         385      0.5279957          0.7012365           6.697795
## 21  BKN         390      0.5239446          0.7357940           5.891474
## 12  BOS         390      0.5136905          0.6827757           7.202982
## 29  CHA         383      0.5156494          0.6877884           6.925698
## 3   CHI         383      0.5150962          0.6565109           7.195617
## 1   CLE         383      0.5244186          0.6834303           6.812521
##    Avg3ptShotDistance AtRimFGM AtRimFGA AtRimFrequency AtRimAccuracy
## 14           25.59966 17.89091 29.02078      0.3305035     0.6164862
## 21           25.44432 18.02821 29.15897      0.3320680     0.6182730
## 12           25.67451 16.40769 26.15385      0.2980278     0.6273529
## 29           25.74308 16.77023 27.92950      0.3204806     0.6004487
## 3            25.43475 17.31070 28.56919      0.3239770     0.6059221
## 1            25.53150 17.31593 27.49347      0.3192070     0.6298196
##    UnblockedAtRimAccuracy AtRimPctAssisted AtRimPctBlocked ShortMidRangeFGM
## 14              0.6908033        0.5717189       0.1075808         6.085714
## 21              0.6962765        0.5623667       0.1120295         6.497436
## 12              0.7013371        0.5422722       0.1054902         6.646154
## 29              0.6830072        0.5659349       0.1208750         6.313316
## 3               0.6833643        0.5259427       0.1133248         7.096606
## 1               0.7114353        0.5291013       0.1147198         6.629243
##    ShortMidRangeFGA ShortMidRangeFrequency ShortMidRangeAccuracy
## 14         15.73766              0.1792285             0.3866975
## 21         16.25641              0.1851311             0.3996845
## 12         16.37179              0.1865595             0.4059514
## 29         16.04700              0.1841333             0.3934266
## 3          17.44386              0.1978149             0.4068253
## 1          16.59008              0.1926155             0.3995908
##    UnblockedShortMidRangeAccuracy ShortMidRangePctAssisted
## 14                      0.4378621                0.4139991
## 21                      0.4431619                0.3910813
## 12                      0.4491423                0.4324846
## 29                      0.4321716                0.4706369
## 3                       0.4489594                0.4576895
## 1                       0.4406456                0.3879480
##    ShortMidRangePctBlocked LongMidRangeFGM LongMidRangeFGA
## 14              0.11685097        4.314286        10.49610
## 21              0.09810726        2.723077         6.94359
## 12              0.09616288        4.833333        11.46667
## 29              0.08965181        4.310705        11.16188
## 3               0.09384823        4.945170        12.84595
## 1               0.09316966        4.331593        10.67624
##    LongMidRangeFrequency LongMidRangeAccuracy UnblockedLongMidRangeAccuracy
## 14            0.11953499            0.4110369                     0.4222166
## 21            0.07907493            0.3921713                     0.4003015
## 12            0.13066472            0.4215116                     0.4269536
## 29            0.12807837            0.3861988                     0.3930017
## 3             0.14567419            0.3849593                     0.3909992
## 1             0.12395417            0.4057227                     0.4134064
##    LongMidRangePctAssisted LongMidRangePctBlocked Corner3FGM Corner3FGA
## 14               0.4900662             0.02647859   3.077922   7.909091
## 21               0.3841808             0.02031019   2.933333   7.723077
## 12               0.5204244             0.01274597   2.779487   7.158974
## 29               0.4778922             0.01730994   2.428198   6.167102
## 3                0.4947202             0.01544715   2.759791   6.924282
## 1                0.3707052             0.01858645   3.088773   7.806789
##    Corner3Frequency Corner3Accuracy UnblockedCorner3Accuracy Corner3PctAssisted
## 14       0.09007277       0.3891626                0.3931652          0.9679325
## 21       0.08795188       0.3798141                0.3833780          0.9571678
## 12       0.08157779       0.3882521                0.3904899          0.9400369
## 29       0.07076517       0.3937341                0.3979461          0.9602151
## 3        0.07852194       0.3985671                0.4022070          0.9489120
## 1        0.09063902       0.3956522                0.4004739          0.9366019
##    Corner3PctBlocked  Arc3FGM  Arc3FGA Arc3Frequency Arc3Accuracy
## 14       0.010180624 8.384416 24.64416     0.2806602    0.3402192
## 21       0.009296149 9.697436 27.72821     0.3157741    0.3497318
## 12       0.005730659 9.630769 26.60513     0.3031702    0.3619892
## 29       0.010584251 9.046997 25.84334     0.2965426    0.3500707
## 3        0.009049774 7.608355 22.39948     0.2540120    0.3396666
## 1        0.012040134 8.248042 23.56397     0.2735843    0.3500277
##    UnblockedArc3Accuracy Arc3PctAssisted Arc3PctBlocked NonHeaveArc3FGM
## 14             0.3426752       0.7667286    0.007166948            3227
## 21             0.3519777       0.7649392    0.006380618            3780
## 12             0.3645540       0.7505325    0.007035466            3755
## 29             0.3528513       0.7483405    0.007880380            3460
## 3              0.3418583       0.8016472    0.006411004            2913
## 1              0.3528820       0.7822096    0.008088643            3159
##    NonHeaveArc3FGA NonHeaveArc3Accuracy HeaveAttempts HeaveMakes
## 14            9398            0.3433709            89          1
## 21           10729            0.3523162            83          2
## 12           10281            0.3652368            94          1
## 29            9783            0.3536747           110          5
## 3             8504            0.3425447            74          1
## 1             8970            0.3521739            55          0
#### misc/pace
# Load the pbpstats pace/misc export and preview it. Rows stay in file order;
# this table is matched to the others by Name later on.
pace_adv <- read.csv(
  "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_pace.csv"
)
head(pace_adv)
##   Name GamesPlayed      Pace SecondsPerPossOff SecondsPerPossDef
## 1  CLE         383  96.89494          15.27293          14.45038
## 2  NOP         390 100.52310          13.92902          14.72068
## 3  CHI         383  97.69253          14.70056          14.77969
## 4  DAL         393  96.19231          15.39162          14.54789
## 5  DEN         391  97.03901          14.94897          14.72979
## 6  HOU         390  99.38628          14.22337          14.75512
##   SecondsExcludingORebsPerPossOff SecondsExcludingORebsPerPossDef
## 1                        14.65544                        13.88705
## 2                        13.38284                        14.13697
## 3                        14.11909                        14.23100
## 4                        14.80348                        13.97215
## 5                        14.29970                        14.16365
## 6                        13.59348                        14.15882
##   FirstChancePoints   Blocks Blocked2s  Blocked3s BlockedAtRim
## 1          94.85117 3.577023  3.472585 0.10443864     2.339426
## 2          98.99744 5.261538  4.989744 0.27179487     3.117949
## 3          92.71279 4.182768  4.031332 0.15143603     2.751958
## 4          95.44529 4.178117  4.083969 0.09414758     2.511450
## 5          97.06394 4.475703  4.235294 0.24040921     2.659847
## 6         100.96410 4.823077  4.597436 0.22564103     2.751282
##   BlockedShortMidRange BlockedLongMidRange BlockedCorner3 BlockedArc3
## 1             1.013055           0.1201044     0.02610966  0.07832898
## 2             1.643590           0.2282051     0.06153846  0.21025641
## 3             1.154047           0.1253264     0.04699739  0.10443864
## 4             1.437659           0.1348601     0.01526718  0.07888041
## 5             1.429668           0.1457801     0.09462916  0.14578005
## 6             1.669231           0.1769231     0.05641026  0.16923077
##   RecoveredBlocks BlocksRecoveredPct   Steals LostBallSteals BadPassSteals
## 1        1.986945          0.5554745 6.950392            821          1841
## 2        3.046154          0.5789474 7.661538           1028          1960
## 3        2.456919          0.5873908 7.848564           1045          1961
## 4        2.412214          0.5773447 6.704835           1013          1622
## 5        2.572890          0.5748571 7.664962           1077          1920
## 6        2.728205          0.5656566 8.310256           1151          2090
##   DefensiveGoaltends
## 1          0.1749347
## 2          0.1666667
## 3          0.1383812
## 4          0.1781170
## 5          0.2710997
## 6          0.2000000
#### rebounds
# Load the pbpstats rebounding export and preview it. Rows stay in file order;
# this table is matched to the others by Name later on.
rebound_adv <- read.csv(
  "/Users/cartererickson/Desktop/School/STAT/STAT 495R/pbpstats_export_rebounds.csv"
)
head(rebound_adv)
##   Name GamesPlayed Rebounds DefRebounds FTDefRebounds DefFTReboundPct
## 1  CLE         383 47.52742    35.15405      2.112272       0.8949115
## 2  NOP         390 50.20000    37.76410      2.182051       0.8846154
## 3  CHI         383 48.53003    36.03133      2.240209       0.8800000
## 4  DAL         393 47.18066    35.97964      2.267176       0.8954774
## 5  DEN         391 49.75703    36.09207      2.375959       0.9198020
## 6  HOU         390 47.81795    35.78205      2.287179       0.8920000
##   DefTwoPtRebounds DefTwoPtReboundPct DefThreePtRebounds DefThreePtReboundPct
## 1         18.05483          0.6987672           14.98695            0.7881368
## 2         19.34872          0.7007801           16.23333            0.7973552
## 3         18.38381          0.7134461           15.40731            0.7994852
## 4         18.29008          0.7079681           15.42239            0.7961382
## 5         18.30179          0.7086552           15.41432            0.8005047
## 6         18.16667          0.6869304           15.32821            0.7805197
##   DefFGReboundPct OffRebounds FTOffRebounds OffFTReboundPct OffTwoPtRebounds
## 1       0.7366552    12.37337     0.2062663      0.08053007         8.154047
## 2       0.7417682    12.43590     0.3307692      0.11611161         8.341026
## 3       0.7502609    12.49869     0.2114883      0.09654350         8.587467
## 4       0.7457503    11.20102     0.2315522      0.09795479         6.519084
## 5       0.7478868    13.66496     0.2813299      0.11727079         8.928389
## 6       0.7268124    12.03590     0.2051282      0.07259528         6.235897
##   OffTwoPtReboundPct OffThreePtRebounds OffThreePtReboundPct OffFGReboundPct
## 1          0.3123000           4.013055            0.2067249       0.2672785
## 2          0.3015947           3.764103            0.1998639       0.2603828
## 3          0.2948982           3.699739            0.2029795       0.2595125
## 4          0.2699968           4.450382            0.2016371       0.2373507
## 5          0.3366442           4.455243            0.2344865       0.2940053
## 6          0.3144963           5.594872            0.2079878       0.2531826
##   DefAtRimReboundPct DefShortMidRangeReboundPct DefLongMidRangeReboundPct
## 1          0.6270492                  0.7026247                 0.8043564
## 2          0.6297801                  0.7035771                 0.8037383
## 3          0.6344351                  0.7322075                 0.8167267
## 4          0.6235775                  0.7035061                 0.8179211
## 5          0.6353591                  0.7174030                 0.8038685
## 6          0.6052083                  0.6965844                 0.7986196
##   DefArc3ReboundPct DefCorner3ReboundPct OffAtRimReboundPct
## 1         0.7902387            0.7812865          0.3860104
## 2         0.8013820            0.7827606          0.3835240
## 3         0.8027317            0.7877270          0.3765778
## 4         0.8021201            0.7748503          0.3503023
## 5         0.8104796            0.7709321          0.4055666
## 6         0.7828512            0.7735554          0.3479466
##   OffShortMidRangeReboundPct OffLongMidRangeReboundPct OffArc3ReboundPct
## 1                  0.3124502                 0.1921618         0.1967850
## 2                  0.2959875                 0.1710628         0.1878453
## 3                  0.2817369                 0.1942568         0.1965970
## 4                  0.2746459                 0.1719490         0.1923796
## 5                  0.3235656                 0.2385120         0.2239006
## 6                  0.3223235                 0.1787611         0.2045991
##   OffCorner3ReboundPct SelfOReb SelfORebPct
## 1            0.2384442      593  0.03824573
## 2            0.2387543      727  0.04471095
## 3            0.2249047      628  0.03890713
## 4            0.2390925      555  0.03342769
## 5            0.2773623      871  0.05451245
## 6            0.2191333      677  0.04130819

Cleaning Data

##### per game stats (FGA, 3PA, RB, AST, STL, BLK, TOV)
# Start from the scraped table (first list element) and discard the columns
# named in select().
bref_per_game <- per_game_df[[1]] %>%
  dplyr::select(
    -c(Rk, G, MP, FG, 'FG%', '3P', '3P%', '2P', '2P%', FT, 'FT%', PF, PTS)
  ) %>%
  # Flag team names that contain "*" first, then strip the asterisk from Team
  # (mutate() evaluates its arguments in order).
  dplyr::mutate(
    PlayoffFlag = ifelse(grepl("\\*", Team), 1, 0),
    Team = sub("\\*", "", Team)
  )

# Drop row 31.
# NOTE(review): presumably the league-average row that follows the 30 teams --
# verify against the scraped table.
bref_per_game <- bref_per_game[-31, ]

# Rename the key column so it matches the advanced-stats table for joining.
bref_per_game <- dplyr::rename(bref_per_game, Franchise = Team)


#### advanced team stats
# Replace the scraped header with 31 explicit column labels. The columns
# labelled "NA", "NA" and "NA.1" are among those dropped just below.
colnames(adv_game_df[[1]]) <- c(
  "Rk", "Team", "Age", "W", "L", "PW", "PL", "MOV", "SOS", "SRS",
  "ORtg", "DRtg", "NRtg", "Pace", "FTr", "3PAr", "TS%", "NA",
  "OeFG%", "OTOV%", "ORB%", "FT/FGA", "NA",
  "DeFG%", "OppTOV%", "DRB%", "OppFT/FGA", "NA.1",
  "Arena", "Attend.", "Attend./G"
)

# Keep only the columns that are not listed here.
bref_adv_game <- adv_game_df[[1]] %>%
  dplyr::select(
    -c(Rk, W, L, PW, PL, MOV, SOS, SRS, FTr, `TS%`, `NA`, NA.1, Arena,
       Attend., `Attend./G`)
  )

# Drop the first row, then row 31 of what is left.
# NOTE(review): presumably a repeated sub-header row and the league-average
# row -- verify against the scraped table.
bref_adv_game <- bref_adv_game[-1, ][-31, ]

# Strip the "*" marker from team names and rename the key column to Franchise.
bref_adv_game[["Team"]] <- sub("\\*", "", bref_adv_game[["Team"]])
bref_adv_game <- dplyr::rename(bref_adv_game, Franchise = Team)

# advanced scoring data
# Keep every column except the ones listed below.
scoring_adv <- dplyr::select(
  scoring_adv,
  !c(
    GamesPlayed,
    FG2M, Fg2Pct, FG3M, Fg3Pct, NonHeaveFg3Pct,
    Assisted2sPct, NonPutbacksAssisted2sPct, Assisted3sPct,
    EfgPct, TsPct,
    Fg2aBlocked, FG2APctBlocked, Fg3aBlocked, FG3APctBlocked
  )
)

# advanced assist data
# GamesPlayed is the only column removed.
assist_adv <- dplyr::select(assist_adv, !GamesPlayed)

# shot distribution
# Keep every column except the ones listed below (grouped by shot zone).
# NOTE(review): ShotQualityAvg is kept here and also survives in the scoring
# table, so joining the two by Name will presumably produce suffixed duplicate
# columns -- confirm that is intended.
shot_dist <- dplyr::select(
  shot_dist,
  !c(
    GamesPlayed,
    AtRimFGM, AtRimAccuracy, UnblockedAtRimAccuracy,
    AtRimPctAssisted, AtRimPctBlocked,
    ShortMidRangeFGM, ShortMidRangeAccuracy, UnblockedShortMidRangeAccuracy,
    ShortMidRangePctAssisted, ShortMidRangePctBlocked,
    LongMidRangeFGM, LongMidRangeAccuracy, UnblockedLongMidRangeAccuracy,
    LongMidRangePctAssisted, LongMidRangePctBlocked,
    Corner3FGM, Corner3Accuracy, UnblockedCorner3Accuracy,
    Corner3PctAssisted, Corner3PctBlocked,
    Arc3FGM, Arc3Accuracy, UnblockedArc3Accuracy,
    Arc3PctAssisted, Arc3PctBlocked,
    NonHeaveArc3FGM, NonHeaveArc3Accuracy,
    HeaveAttempts, HeaveMakes
  )
)

# misc/pace
# Keep every column except the ones listed below.
pace_adv <- dplyr::select(
  pace_adv,
  !c(
    GamesPlayed,
    Blocked2s, Blocked3s, BlockedAtRim, BlockedShortMidRange,
    BlockedLongMidRange, BlockedCorner3, BlockedArc3, BlocksRecoveredPct,
    LostBallSteals, BadPassSteals,
    DefensiveGoaltends, FirstChancePoints, Pace
  )
)

# rebounds
# Keep every column except the ones listed below.
rebound_adv <- dplyr::select(
  rebound_adv,
  !c(
    GamesPlayed,
    FTDefRebounds, DefFTReboundPct,
    DefTwoPtRebounds, DefTwoPtReboundPct,
    DefThreePtRebounds, DefThreePtReboundPct,
    DefFGReboundPct,
    FTOffRebounds, OffFTReboundPct,
    OffTwoPtRebounds, OffTwoPtReboundPct,
    OffThreePtRebounds, OffThreePtReboundPct,
    OffFGReboundPct,
    DefAtRimReboundPct, DefShortMidRangeReboundPct,
    DefLongMidRangeReboundPct, DefArc3ReboundPct, DefCorner3ReboundPct,
    OffAtRimReboundPct, OffShortMidRangeReboundPct,
    OffLongMidRangeReboundPct, OffArc3ReboundPct, OffCorner3ReboundPct,
    SelfORebPct
  )
)

Combining Data

# Combining Basketball Reference data
# Match the two Basketball Reference tables on Franchise (only rows present in
# both are kept), then discard PlayoffFlag.
bref_all <- dplyr::select(
  dplyr::inner_join(bref_per_game, bref_adv_game, by = "Franchise"),
  !c(PlayoffFlag)
)


# Combining scoring, shot distribution, and assisting PBP data
# Scoring is joined to assists first, and that result to shot distribution.
pbp_offense <- dplyr::inner_join(
  dplyr::inner_join(scoring_adv, assist_adv, by = "Name"),
  shot_dist,
  by = "Name"
)

# Combining Defense and Rebounding PBP data
pbp_defense <- dplyr::inner_join(pace_adv, rebound_adv, by = "Name")

# Convert the Basketball Reference table to a plain data frame, label each row
# with its Franchise value, then drop the first column.
bref_all <- as.data.frame(bref_all)
row.names(bref_all) <- bref_all[["Franchise"]]
bref_all <- bref_all[, -1]

# Same treatment for the PBP tables: label rows with Name, then drop the
# first column.
row.names(pbp_offense) <- pbp_offense[["Name"]]
pbp_offense <- pbp_offense[, -1]

row.names(pbp_defense) <- pbp_defense[["Name"]]
pbp_defense <- pbp_defense[, -1]

# Convert every column to numeric so the correlation and modelling steps
# below can run on the table.
# Assigning through `bref_all[]` keeps the data-frame structure and row names.
# This covers all columns rather than a hard-coded index range, and lapply()
# returns a list of columns, so no intermediate matrix is built.
bref_all[] <- lapply(bref_all, as.numeric)

EDA

# Print per-column summary statistics for the combined Basketball Reference
# table.
summary(bref_all)
##       FGA            3PA            2PA            FTA            ORB       
##  Min.   :6029   Min.   :2046   Min.   :3246   Min.   :1258   Min.   :574.0  
##  1st Qu.:6282   1st Qu.:2266   1st Qu.:3666   1st Qu.:1510   1st Qu.:671.8  
##  Median :6364   Median :2476   Median :3873   Median :1541   Median :696.5  
##  Mean   :6366   Mean   :2494   Mean   :3872   Mean   :1571   Mean   :707.7  
##  3rd Qu.:6423   3rd Qu.:2668   3rd Qu.:4055   3rd Qu.:1621   3rd Qu.:757.8  
##  Max.   :6610   Max.   :3098   Max.   :4472   Max.   :1884   Max.   :845.0  
##       DRB            TRB            AST            STL             BLK       
##  Min.   :2307   Min.   :2981   Min.   :1531   Min.   :450.0   Min.   :286.0  
##  1st Qu.:2412   1st Qu.:3094   1st Qu.:1704   1st Qu.:504.2   1st Qu.:316.5  
##  Median :2490   Median :3186   Median :1767   Median :544.0   Median :351.0  
##  Mean   :2482   Mean   :3190   Mean   :1786   Mean   :545.2   Mean   :350.8  
##  3rd Qu.:2544   3rd Qu.:3253   3rd Qu.:1919   3rd Qu.:578.8   3rd Qu.:371.0  
##  Max.   :2724   Max.   :3474   Max.   :1991   Max.   :655.0   Max.   :460.0  
##       TOV              Age             ORtg            DRtg      
##  Min.   : 799.0   Min.   :22.80   Min.   :103.5   Min.   :107.1  
##  1st Qu.: 952.2   1st Qu.:25.18   1st Qu.:110.7   1st Qu.:111.2  
##  Median :1003.5   Median :26.25   Median :112.0   Median :112.5  
##  Mean   : 996.2   Mean   :26.09   Mean   :112.4   Mean   :112.3  
##  3rd Qu.:1058.0   3rd Qu.:27.00   3rd Qu.:115.6   3rd Qu.:113.8  
##  Max.   :1162.0   Max.   :28.80   Max.   :118.3   Max.   :117.2  
##       NRtg                Pace             3PAr            OeFG%       
##  Min.   :-10.50000   Min.   : 95.90   Min.   :0.3140   Min.   :0.4900  
##  1st Qu.: -1.87500   1st Qu.: 97.67   1st Qu.:0.3578   1st Qu.:0.5250  
##  Median :  0.50000   Median : 98.80   Median :0.3885   Median :0.5395  
##  Mean   :  0.02333   Mean   : 99.18   Mean   :0.3919   Mean   :0.5379  
##  3rd Qu.:  2.70000   3rd Qu.:100.33   3rd Qu.:0.4198   3rd Qu.:0.5497  
##  Max.   :  9.30000   Max.   :104.10   Max.   :0.4880   Max.   :0.5750  
##      OTOV%            ORB%           FT/FGA           DeFG%       
##  Min.   : 9.90   Min.   :17.90   Min.   :0.1560   Min.   :0.5070  
##  1st Qu.:11.90   1st Qu.:21.12   1st Qu.:0.1810   1st Qu.:0.5310  
##  Median :12.25   Median :22.10   Median :0.1895   Median :0.5390  
##  Mean   :12.36   Mean   :22.19   Mean   :0.1920   Mean   :0.5380  
##  3rd Qu.:13.07   3rd Qu.:23.45   3rd Qu.:0.1983   3rd Qu.:0.5467  
##  Max.   :14.20   Max.   :26.30   Max.   :0.2260   Max.   :0.5570  
##     OppTOV%           DRB%         OppFT/FGA     
##  Min.   :10.30   Min.   :74.90   Min.   :0.1570  
##  1st Qu.:11.53   1st Qu.:76.95   1st Qu.:0.1832  
##  Median :12.45   Median :77.85   Median :0.1955  
##  Mean   :12.36   Mean   :77.80   Mean   :0.1921  
##  3rd Qu.:13.07   3rd Qu.:78.58   3rd Qu.:0.2008  
##  Max.   :14.40   Max.   :80.30   Max.   :0.2340
# Pairwise correlations among the Basketball Reference variables, plotted to
# see how they relate to pace.
team_cor <- stats::cor(bref_all)
corrplot::corrplot(team_cor)

# Correlation plots for the two PBP tables (offense, then defense).
pbp_offense_cor <- stats::cor(pbp_offense)
corrplot::corrplot(pbp_offense_cor)

pbp_defense_cor <- stats::cor(pbp_defense)
corrplot::corrplot(pbp_defense_cor)

# Scatter plot: seconds per defensive possession (x) against defensive
# rebounds (y).
ggplot2::ggplot(
  data = pbp_defense,
  mapping = ggplot2::aes(x = SecondsPerPossDef, y = DefRebounds)
) +
  ggplot2::geom_point()

# Scatterplot matrices of the Basketball Reference variables, drawn in two
# panels: columns 1-13 first, then columns 13-25.
graphics::pairs(bref_all, horInd = 1:13, verInd = 1:13)

graphics::pairs(bref_all, horInd = 13:25, verInd = 13:25)

LASSO 1

# Design matrix of every predictor of NRtg in the Basketball Reference data.
# Column 1 (the intercept) is removed because glmnet adds its own.
# NOTE(review): ORtg and DRtg stay in the predictor set. The coefficient
# output further down (ORtg = 1, DRtg = -1, everything else ~0) suggests NRtg
# is simply ORtg - DRtg, so the LASSO may only be recovering that identity --
# confirm whether those two columns should be excluded.
brefX <- model.matrix(NRtg ~ ., data = bref_all)[, -1]

# Grid of 1000 penalty values, evenly spaced on the log scale from exp(-15)
# to exp(15).
lambda <- exp(seq(from = -15, to = 15, length.out = 1000))

# alpha = 1 selects the LASSO penalty; plot the coefficient paths.
lasso_lm <- glmnet(x = brefX, y = bref_all$NRtg, alpha = 1, lambda = lambda)
plot(lasso_lm)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Cross-validate over the same lambda grid and keep the penalty with the
# lowest cross-validated error.
# NOTE(review): no seed is set in the code shown here, so the fold assignment
# (and therefore lambda.min) may vary between runs -- confirm whether one is
# set elsewhere.
lasso_cv <- cv.glmnet(x = brefX, y = bref_all$NRtg, alpha = 1, lambda = lambda)
lbestlam <- lasso_cv[["lambda.min"]]
plot(lasso_cv)

# LASSO coefficients at that penalty. lvars holds the names of the non-zero
# ones with the first entry (the intercept row in the output) removed.
lcoefs <- predict(lasso_lm, s = lbestlam, type = "coefficient")
lvars <- names(lcoefs[lcoefs[, 1] != 0, ])[-1]

# Refit ordinary least squares on only the LASSO-selected columns.
# data.frame() rewrites non-syntactic column names (the output shows `DeFG%`
# as X.DeFG..).
regular_lm_lasso <- lm(bref_all$NRtg ~ ., data = data.frame(brefX[, lvars]))
regular_sumary_lasso <- summary(regular_lm_lasso)

# Side by side: non-zero LASSO estimates (first column) next to the OLS
# coefficient table.
cbind(lcoefs[lcoefs != 0], regular_sumary_lasso$coefficients)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
##                                Estimate   Std. Error       t value   Pr(>|t|)
## (Intercept) -4.454678e-06 -2.075628e-14 1.049645e-13 -1.977458e-01 0.84478187
## ORtg         9.999999e-01  1.000000e+00 4.635633e-16  2.157203e+15 0.00000000
## DRtg        -9.999999e-01 -1.000000e+00 1.299090e-15 -7.697694e+14 0.00000000
## X.DeFG..    -6.108012e-06 -4.573570e-13 2.593316e-13 -1.763600e+00 0.08954933
# Print the full sparse coefficient matrix; "." marks a coefficient that the
# LASSO set to zero.
lcoefs
## 25 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -4.454678e-06
## FGA          .           
## `3PA`        .           
## `2PA`        .           
## FTA          .           
## ORB          .           
## DRB          .           
## TRB          .           
## AST          .           
## STL          .           
## BLK          .           
## TOV          .           
## Age          .           
## ORtg         9.999999e-01
## DRtg        -9.999999e-01
## Pace         .           
## `3PAr`       .           
## `OeFG%`      .           
## `OTOV%`      .           
## `ORB%`       .           
## `FT/FGA`     .           
## `DeFG%`     -6.108012e-06
## `OppTOV%`    .           
## `DRB%`       .           
## `OppFT/FGA`  .
# Drop the variables LASSO recommended removing plus those judged unhelpful,
# then print the reduced table.
bref_final <- dplyr::select(
  bref_all,
  !c(
    `2PA`, NRtg, `3PAr`, `OTOV%`, `FT/FGA`, `DeFG%`,
    `OppTOV%`, `ORB%`, `DRB%`, `OeFG%`, `OppFT/FGA`, Age
  )
)
bref_final
##                         FGA  3PA  FTA ORB  DRB  TRB  AST STL BLK  TOV  ORtg
## Milwaukee Bucks        6610 2669 1539 741 2724 3465 1834 585 334  995 117.2
## Brooklyn Nets          6289 2600 1623 640 2559 3199 1929 484 379  975 118.3
## Washington Wizards     6547 2088 1884 697 2557 3254 1835 528 297 1037 111.2
## Utah Jazz              6344 3098 1546 765 2709 3474 1703 474 371 1023 117.6
## Portland Trail Blazers 6558 2939 1558 766 2441 3207 1531 496 363  799 117.8
## Indiana Pacers         6567 2445 1493 648 2424 3072 1973 611 460  975 112.4
## Phoenix Suns           6357 2490 1347 630 2462 3092 1939 517 312  902 117.2
## Denver Nuggets         6422 2462 1406 758 2442 3200 1933 582 323  972 117.1
## New Orleans Pelicans   6412 2190 1878 845 2568 3413 1872 545 315 1052 113.5
## Los Angeles Clippers   6242 2498 1387 678 2501 3179 1756 509 295  950 117.6
## Sacramento Kings       6382 2400 1585 674 2307 2981 1836 543 358  963 113.6
## Golden State Warriors  6347 2789 1520 574 2524 3098 1991 587 342 1080 111.1
## Atlanta Hawks          6281 2402 1745 760 2525 3285 1737 503 342  953 115.7
## Philadelphia 76ers     6257 2169 1836 722 2522 3244 1706 655 447 1040 113.2
## Memphis Grizzlies      6608 2258 1536 803 2543 3346 1938 655 364  957 112.0
## Boston Celtics         6401 2618 1496 765 2421 3186 1689 556 383 1012 114.0
## Dallas Mavericks       6287 2744 1524 657 2463 3120 1647 450 311  869 115.4
## Minnesota Timberwolves 6546 2706 1662 757 2376 3133 1846 632 398 1027 109.5
## Toronto Raptors        6383 2831 1536 680 2314 2994 1735 618 389  952 112.0
## San Antonio Spurs      6518 2046 1584 669 2489 3158 1759 505 366  821 111.0
## Chicago Bulls          6380 2446 1258 693 2544 3237 1927 482 304 1089 111.1
## Los Angeles Lakers     6197 2248 1679 695 2490 3185 1775 562 386 1095 109.9
## Charlotte Hornets      6324 2666 1505 762 2389 3151 1933 565 344 1069 110.9
## Houston Rockets        6372 2923 1606 671 2396 3067 1699 546 361 1060 107.1
## Miami Heat             6029 2606 1520 579 2409 2988 1895 569 286 1013 111.2
## New York Knicks        6225 2163 1506 696 2554 3250 1541 507 365  932 110.6
## Detroit Pistons        6162 2370 1683 694 2381 3075 1743 531 371 1075 108.0
## Oklahoma City Thunder  6338 2529 1536 715 2568 3283 1588 504 316 1162 103.5
## Orlando Magic          6423 2288 1543 747 2525 3272 1571 496 318  924 105.1
## Cleveland Cavaliers    6175 2141 1614 751 2327 3078 1716 559 325 1114 105.8
##                         DRtg  Pace
## Milwaukee Bucks        111.4 102.2
## Brooklyn Nets          113.8  99.5
## Washington Wizards     113.0 104.1
## Utah Jazz              108.3  98.5
## Portland Trail Blazers 116.0  98.4
## Indiana Pacers         112.4 101.6
## Phoenix Suns           111.3  97.2
## Denver Nuggets         112.1  97.1
## New Orleans Pelicans   113.8 100.1
## Los Angeles Clippers   111.2  96.9
## Sacramento Kings       117.2 100.0
## Golden State Warriors  110.1 102.2
## Atlanta Hawks          113.3  97.6
## Philadelphia 76ers     107.6  99.5
## Memphis Grizzlies      111.0 100.4
## Boston Celtics         112.5  98.3
## Dallas Mavericks       113.0  97.3
## Minnesota Timberwolves 115.0 101.6
## Toronto Raptors        112.5  99.2
## San Antonio Spurs      112.8  98.9
## Chicago Bulls          112.0  99.0
## Los Angeles Lakers     107.1  98.7
## Charlotte Hornets      112.8  98.3
## Houston Rockets        114.9 101.4
## Miami Heat             111.2  96.6
## New York Knicks        108.2  95.9
## Detroit Pistons        112.5  97.9
## Oklahoma City Thunder  114.0 101.0
## Orlando Magic          114.5  98.7
## Cleveland Cavaliers    114.4  97.3

LASSO 2

# Design matrix for the play-by-play offense predictors. model.matrix() puts an
# intercept column first; it is removed because glmnet adds its own intercept.
pbpoX <- model.matrix(OffPoss ~ ., data = pbp_offense)[, -1]

# Candidate penalties: 1000 values log-spaced between exp(-15) and exp(15)
lambda <- exp(seq(from = -15, to = 15, length.out = 1000))

# alpha = 1 selects the LASSO penalty
lasso_lm <- glmnet(
  x = pbpoX,
  y = pbp_offense$OffPoss,
  alpha = 1,
  lambda = lambda
)
## Warning: from glmnet Fortran code (error code -730); Convergence for 730th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
# Plot the glmnet fit stored in lasso_lm (the LASSO path fitted above)
plot(lasso_lm)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Cross-validate the LASSO over the same lambda grid used for lasso_lm.
# cv.glmnet() assigns observations to folds at random, so the seed is fixed
# here; otherwise lambda.min (and the variables selected from it) can change
# every time the document is knitted.
set.seed(2021)
lasso_cv <- cv.glmnet(pbpoX, pbp_offense$OffPoss, alpha = 1, lambda = lambda)
## Warning: from glmnet Fortran code (error code -730); Convergence for 730th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -740); Convergence for 740th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -661); Convergence for 661th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -636); Convergence for 636th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -671); Convergence for 671th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
# Penalty value reported by cv.glmnet as lambda.min
lbestlam <- lasso_cv[["lambda.min"]]

# Plot the cross-validation result
plot(lasso_cv)

# LASSO coefficients at the cross-validated penalty (sparse one-column matrix)
lcoefs <- predict(lasso_lm, s = lbestlam, type = "coefficients")

# Dense, named copy of the coefficients. Working on a plain vector avoids
# logical indexing of the sparse matrix (which triggers the Matrix
# "maybe inefficient" message) and lets everything below be matched by name.
lcoef_vec <- setNames(as.numeric(lcoefs[, 1]), rownames(lcoefs))

# Predictors kept by the LASSO: non-zero coefficients, excluding the intercept
# by name rather than by assuming it is the first non-zero entry.
lvars <- setdiff(names(lcoef_vec)[lcoef_vec != 0], "(Intercept)")

# Refit an ordinary least-squares model on the selected predictors only.
# drop = FALSE keeps a matrix (and its column name) even if a single
# predictor was selected.
lasso_design <- data.frame(pbpoX[, lvars, drop = FALSE])
regular_lm_lasso <- lm(pbp_offense$OffPoss ~ ., data = lasso_design)
regular_sumary_lasso <- summary(regular_lm_lasso)

# Side-by-side comparison of LASSO and OLS estimates, aligned by coefficient
# name instead of by position. data.frame() may alter column names, so the
# lookup is keyed by the names actually used in the refit; a coefficient that
# summary() omits can then no longer shift the rows out of alignment.
ols_coefs <- regular_sumary_lasso$coefficients
lasso_lookup <- c(
  lcoef_vec["(Intercept)"],
  setNames(lcoef_vec[lvars], names(lasso_design))
)
cbind(LASSO = lasso_lookup[rownames(ols_coefs)], ols_coefs)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
##                                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)        106.8799456  50.226619 2519.456250  0.0199355 9.843142e-01
## FG2A                44.5868615  59.071783    9.661006  6.1144545 8.922583e-06
## FG3A                46.9471921  75.129683   13.062477  5.7515647 1.881936e-05
## FtPoints            16.6628720  20.818434   11.044274  1.8849979 7.567999e-02
## PtsAssisted2s        0.5541723  -3.551537    6.230448 -0.5700291 5.757045e-01
## PtsAssisted3s       -8.6827225 -21.224589    9.358974 -2.2678328 3.588468e-02
## PtsUnassisted3s     -5.5118119 -19.171310   10.404971 -1.8425145 8.193452e-02
## ShotQualityAvg.x   310.1949445 345.753350 1546.263612  0.2236057 8.255823e-01
## PtsPutbacks        -36.0523029 -62.648877   22.826839 -2.7445271 1.332702e-02
## AtRimAssists        49.7276794  65.193560   20.600352  3.1646819 5.362787e-03
## Avg3ptShotDistance  95.6834436  48.770663   93.862937  0.5195945 6.096771e-01
## Corner3FGA          16.4559906  15.771654   18.935731  0.8329044 4.158138e-01
# Print every coefficient returned for the selected lambda, including the
# entries the LASSO shrank to zero
lcoefs
## 36 x 1 sparse Matrix of class "dgCMatrix"
##                                 s1
## (Intercept)            106.8799456
## Points                   .        
## FG2A                    44.5868615
## FG3A                    46.9471921
## FtPoints                16.6628720
## PtsAssisted2s            0.5541723
## PtsUnassisted2s          .        
## PtsAssisted3s           -8.6827225
## PtsUnassisted3s         -5.5118119
## FG3APct                  .        
## ShotQualityAvg.x       310.1949445
## PtsPutbacks            -36.0523029
## Assists                  .        
## AssistPoints             .        
## TwoPtAssists             .        
## ThreePtAssists           .        
## AtRimAssists            49.7276794
## ShortMidRangeAssists     .        
## LongMidRangeAssists      .        
## Corner3Assists           .        
## Arc3Assists              .        
## ShotQualityAvg.y         .        
## AtRimFG3AFrequency       .        
## Avg2ptShotDistance       .        
## Avg3ptShotDistance      95.6834436
## AtRimFGA                 .        
## AtRimFrequency           .        
## ShortMidRangeFGA         .        
## ShortMidRangeFrequency   .        
## LongMidRangeFGA          .        
## LongMidRangeFrequency    .        
## Corner3FGA              16.4559906
## Corner3Frequency         .        
## Arc3FGA                  .        
## Arc3Frequency            .        
## NonHeaveArc3FGA          .
# Columns dropped from the play-by-play offense table before clustering.
# NOTE(review): this list is not identical to the coefficients the LASSO set
# to zero above (PtsAssisted2s has a non-zero coefficient but is dropped,
# PtsUnassisted2s is zero but kept) -- confirm the intended selection.
pbpo_drop_cols <- c(
  "Points", "PtsAssisted2s", "FG3APct", "Assists",
  "AssistPoints", "TwoPtAssists", "LongMidRangeAssists",
  "Arc3Assists", "NonHeaveArc3FGA", "Corner3Frequency",
  "ShortMidRangeFrequency", "LongMidRangeFrequency",
  "AtRimFrequency", "Avg2ptShotDistance",
  "AtRimFG3AFrequency", "ShotQualityAvg.y"
)
pbpo_final <- dplyr::select(pbp_offense, !dplyr::all_of(pbpo_drop_cols))

# Display only: the renamed table is printed but not stored, so pbpo_final
# keeps the column name ShotQualityAvg.x that later code refers to.
pbpo_final %>%
  dplyr::rename(ShotQualityAvg = ShotQualityAvg.x)
##     OffPoss     FG2A     FG3A FtPoints PtsUnassisted2s PtsAssisted3s
## ATL    7086 53.87500 33.36111 19.68056        28.13889      29.50000
## BKN    7187 51.23611 36.11111 18.12500        26.88889      33.79167
## BOS    7094 52.54167 36.36111 16.11111        29.08333      30.41667
## CHA    7083 50.80556 37.02778 15.91667        21.55556      34.12500
## CHI    7136 54.63889 33.97222 13.81944        26.72222      31.58333
## CLE    7032 56.02778 29.73611 16.66667        26.30556      25.20833
## DAL    6996 49.20833 38.11111 16.47222        30.27778      32.20833
## DEN    7044 55.00000 34.19444 15.68056        28.97222      32.87500
## DET    7050 52.66667 32.91667 17.75000        26.61111      31.25000
## GSW    7344 49.41667 38.73611 16.56944        21.63889      35.20833
## HOU    7262 47.90278 40.59722 16.51389        26.16667      33.62500
## IND    7353 57.25000 33.95833 16.43056        28.75000      32.58333
## LAC    6970 52.00000 34.69444 16.16667        30.08333      35.70833
## LAL    7099 54.84722 31.22222 17.23611        29.22222      29.08333
## MEM    7229 60.41667 31.36111 16.44444        29.00000      29.16667
## MIA    6949 47.54167 36.19444 16.68056        23.02778      34.66667
## MIL    7348 54.73611 37.06944 16.23611        32.25000      33.83333
## MIN    7305 53.33333 37.58333 17.56944        25.83333      32.83333
## NOP    7224 58.63889 30.41667 19.01389        30.27778      27.66667
## NYK    6926 56.41667 30.04167 16.40278        32.33333      29.70833
## OKC    7274 52.90278 35.12500 15.45833        29.33333      29.41667
## ORL    7081 57.43056 31.77778 16.61111        29.08333      27.00000
## PHI    7179 56.77778 30.12500 19.56944        33.02778      30.20833
## PHX    7062 53.70833 34.58333 15.61111        28.80556      33.25000
## POR    7078 50.26389 40.81944 17.80556        30.41667      32.70833
## SAC    7189 55.30556 33.33333 16.40278        29.47222      29.33333
## SAS    7174 62.11111 28.41667 17.41667        32.41667      26.16667
## TOR    7105 49.33333 39.31944 17.38889        26.08333      35.66667
## UTA    7124 45.08333 43.02778 17.15278        27.27778      38.29167
## WAS    7506 61.91667 29.00000 20.12500        32.91667      26.87500
##     PtsUnassisted3s ShotQualityAvg PtsPutbacks ThreePtAssists AtRimAssists
## ATL        7.791667      0.5304958    6.000000       9.407792    10.228571
## BKN        8.708333      0.5354285    4.250000      10.225641    10.138462
## BOS       10.375000      0.5129291    5.194444       9.841026     8.897436
## CHA        6.916667      0.5330921    3.388889       9.101828     9.490862
## CHI        6.083333      0.5294977    4.638889       8.718016     9.104439
## CLE        4.791667      0.5345787    4.722222       9.344648     9.161880
## DAL        9.208333      0.5239492    4.000000      10.170483     7.969466
## DEN        5.750000      0.5126785    6.305556       9.780051    10.690537
## DET        3.416667      0.5214913    4.805556       9.335938     8.653646
## GSW        8.458333      0.5204043    3.416667      10.049608    11.647520
## HOU        7.708333      0.5324225    4.472222      10.743590    10.046154
## IND        4.458333      0.5377273    4.611111       8.437340     9.723785
## LAC        7.083333      0.5147173    4.583333       9.002564     9.694872
## LAL        4.083333      0.5230556    4.722222       8.473008    11.053985
## MEM        4.291667      0.5248266    6.472222       8.480818     9.263427
## MIA        4.166667      0.5220335    2.944444      10.281330     9.317136
## MIL        9.416667      0.5283572    4.527778       9.831202    10.703325
## MIN        6.500000      0.5407673    5.416667       8.510471    10.060209
## NOP        4.083333      0.5358961    6.388889       9.302564    11.112821
## NYK        5.583333      0.5154981    4.333333       8.132812     8.289062
## OKC        6.291667      0.5264936    5.194444       8.541026     9.446154
## ORL        5.666667      0.5030609    5.333333       8.831202     8.132992
## PHI        3.583333      0.5152936    5.194444      10.028133     9.979540
## PHX        5.916667      0.5136009    4.111111       8.411765     9.994885
## POR       14.458333      0.5247641    5.777778       8.622449     8.178571
## SAC        7.083333      0.5200065    4.611111       8.892308     9.076923
## SAS        3.666667      0.5095367    4.416667       8.449871     7.984576
## TOR        7.708333      0.5201906    4.555556      10.028205     8.884615
## UTA       11.916667      0.5194589    4.861111       9.882051     9.733333
## WAS        3.666667      0.4994307    4.416667       9.069231     9.687179
##     ShortMidRangeAssists Corner3Assists Avg3ptShotDistance AtRimFGA
## ATL             2.519481       2.979221           25.59966 29.02078
## BKN             2.541026       2.807692           25.44432 29.15897
## BOS             2.874359       2.612821           25.67451 26.15385
## CHA             2.971279       2.331593           25.74308 27.92950
## CHI             3.248042       2.618799           25.43475 28.56919
## CLE             2.571802       2.892950           25.53150 27.49347
## DAL             2.300254       2.735369           25.61850 23.07888
## DEN             3.641944       2.514066           25.42626 28.69054
## DET             2.895833       2.786458           25.68576 26.77083
## GSW             3.234987       2.579634           26.11247 25.69974
## HOU             1.333333       3.582051           25.64567 28.76667
## IND             3.145780       2.552430           25.90705 28.70844
## LAC             2.438462       2.843590           25.55084 28.10513
## LAL             2.586118       2.478149           25.61972 31.60154
## MEM             3.777494       2.370844           25.54225 27.30435
## MIA             2.682864       3.117647           25.43866 26.61893
## MIL             2.485934       2.846547           25.79341 29.98721
## MIN             3.065445       2.353403           25.40405 28.94241
## NOP             3.210256       2.635897           25.45126 30.96154
## NYK             2.380208       2.325521           25.51956 28.18750
## OKC             2.294872       2.569231           25.41701 30.28462
## ORL             3.358056       1.897698           25.69389 25.03581
## PHI             2.951407       2.329923           25.70648 27.37340
## PHX             3.135550       2.644501           25.47443 27.86701
## POR             2.204082       2.142857           25.71303 28.38010
## SAC             3.423077       2.525641           25.44977 26.40256
## SAS             3.637532       2.416452           25.40403 23.59640
## TOR             2.751282       3.248718           25.54064 27.88974
## UTA             2.023077       3.664103           25.24738 26.49231
## WAS             3.517949       2.264103           25.77688 26.93590
##     ShortMidRangeFGA LongMidRangeFGA Corner3FGA  Arc3FGA Arc3Frequency
## ATL         15.73766       10.496104   7.909091 24.64416     0.2806602
## BKN         16.25641        6.943590   7.723077 27.72821     0.3157741
## BOS         16.37179       11.466667   7.158974 26.60513     0.3031702
## CHA         16.04700       11.161880   6.167102 25.84334     0.2965426
## CHI         17.44386       12.845953   6.924282 22.39948     0.2540120
## CLE         16.59008       10.676240   7.806789 23.56397     0.2735843
## DAL         15.85496       11.882952   7.267176 28.36132     0.3280841
## DEN         18.32992       10.324808   6.414322 24.70588     0.2792715
## DET         18.67188       11.328125   7.653646 22.73438     0.2608384
## GSW         15.01305       14.067885   6.671018 26.16710     0.2986471
## HOU         10.93846        4.971795  10.241026 32.53077     0.3719982
## IND         16.30691       15.611253   6.391304 20.37852     0.2331734
## LAC         15.87949       12.943590   7.276923 22.12564     0.2562892
## LAL         15.86889       11.084833   6.874036 22.75321     0.2580241
## MEM         19.70077       10.703325   6.562660 22.19437     0.2566848
## MIA         16.58056       10.184143   8.652174 23.48849     0.2746411
## MIL         15.00256       10.322251   7.557545 24.70077     0.2820678
## MIN         17.63874       12.863874   6.602094 22.64136     0.2552909
## NOP         17.25128       11.123077   7.248718 23.01538     0.2568681
## NYK         17.22135       15.575521   6.169271 20.89323     0.2372967
## OKC         15.72821       11.951282   7.346154 23.38205     0.2636311
## ORL         18.41176       14.237852   5.465473 24.76215     0.2816664
## PHI         17.67519       11.621483   6.002558 24.27366     0.2791799
## PHX         17.98721       12.971867   7.378517 21.64962     0.2464266
## POR         15.06888       13.591837   5.885204 26.16582     0.2936949
## SAC         19.47179       12.810256   6.533333 22.42308     0.2558514
## SAS         20.41388       17.521851   5.964010 19.85090     0.2272647
## TOR         16.07179       10.243590   8.566667 24.67949     0.2822084
## UTA         16.08462        8.425641   9.505128 23.76923     0.2820372
## WAS         17.44872       15.269231   6.497436 22.64872     0.2550531

Cluster Analysis

# Centre and scale every column of each table so that no single variable
# dominates the distance calculations used for clustering
scale_adv_game <- scale(bref_final)
scale_pbp_offense <- scale(pbpo_final)
scale_pbp_defense <- scale(pbp_defense)


# Within-groups sum of squares for k = 1, ..., max_k clusters (elbow curve).
#
# x       numeric matrix (or object coercible to one), one row per team
# max_k   largest number of clusters to try; must be smaller than nrow(x)
# nstart  number of random starts per k; the best of them is kept so that a
#         single unlucky initialisation does not put a spike in the curve
#
# Returns a numeric vector of length max_k; element k is the total
# within-cluster sum of squares of the k-cluster solution.
elbow_wss <- function(x, max_k, nstart = 25L) {
  x <- as.matrix(x)
  stopifnot(max_k >= 1L, max_k < nrow(x))

  # k = 1: total sum of squares about the column means
  wss <- (nrow(x) - 1) * sum(apply(x, 2, var))

  if (max_k >= 2L) {
    wss <- c(
      wss,
      vapply(
        2:max_k,
        function(k) kmeans(x, centers = k, nstart = nstart)$tot.withinss,
        numeric(1)
      )
    )
  }
  wss
}

# k-means starts from random centres; fix the seed so the curves below are
# the same every time the document is knitted
set.seed(2021)

# Determine number of clusters for Basketball Reference data
wss <- elbow_wss(scale_adv_game, max_k = 25)
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Determine number of clusters for PBP shot and assist data
wss2 <- elbow_wss(scale_pbp_offense, max_k = 21)
plot(seq_along(wss2), wss2, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Determine number of clusters for PBP rebound, pace, and shot dist data
wss3 <- elbow_wss(scale_pbp_defense, max_k = 28)
plot(seq_along(wss3), wss3, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

K-Means Cluster Analysis 1

# K-Means Cluster Analysis for Basketball Reference data
# kmeans() starts from randomly chosen centres. The seed makes the cluster
# assignment reproducible between runs, and nstart keeps the best of several
# starts instead of relying on a single random initialisation.
set.seed(2021)
fit <- kmeans(scale_adv_game, centers = 6, nstart = 25) # 6 cluster solution

# get cluster means (on the standardized scale)
aggregate(scale_adv_game, by = list(fit$cluster), FUN = mean)
##   Group.1         FGA        3PA         FTA         ORB         DRB        TRB
## 1       1  0.24820470  0.7219853 -0.04381108  0.16866289 -1.13941025 -0.8144573
## 2       2  0.87047760 -0.5768048  0.57901674  0.81259951  0.98435238  1.1669719
## 3       3  0.64035404  0.4514550 -0.45942432 -1.56376907 -0.07752893 -0.8170627
## 4       4 -0.98460010 -1.1044000  0.72821124 -0.05496363  0.39957219  0.2876394
## 5       5 -1.71958659 -0.4470482  0.24510524 -0.53454822 -1.08805561 -1.1140817
## 6       6  0.01197582  0.2880352 -0.52123347 -0.09915007  0.31508553  0.1998365
##            AST        STL        BLK        TOV       ORtg        DRtg
## 1  0.028383847  0.5868219  0.5105069  0.2091850 -0.2924702  0.76578960
## 2 -0.097208397  0.1299214 -0.6421219  0.2963454 -0.4841173  0.26087338
## 3  1.477718346  1.0033162  1.2004888  0.3716203 -0.1508179 -0.45442459
## 4 -0.843226328  0.5495239  1.1606054  0.3102119 -0.2799714 -1.97618597
## 5 -0.009293826  0.1454622 -0.5623552  0.8450599 -1.0048975  0.15568250
## 6  0.001507107 -0.8392050 -0.3406038 -0.7242239  0.8815767  0.02103818
##         Pace
## 1  0.3170937
## 2  0.9734435
## 3  1.3911206
## 4 -0.5864528
## 5 -0.9785579
## 6 -0.5830432
# Standardized data plus the cluster label of each team; the label column is
# named fit.cluster, which the plotting code further down relies on
clust_adv_game <- data.frame(scale_adv_game, fit.cluster = fit$cluster)

# Draw the clusters on the first two principal components
# (adjust the display options below for the most readable graph)
clusplot(
  scale_adv_game,
  fit$cluster,
  lines = 0,
  labels = 2,
  shade = TRUE,
  color = TRUE
)

K-Means Cluster Analysis 2

# K-Means Cluster Analysis for PBP shot and assist data
# kmeans() starts from randomly chosen centres. The seed makes the cluster
# assignment reproducible between runs, and nstart keeps the best of several
# starts instead of relying on a single random initialisation.
set.seed(2021)
fit2 <- kmeans(scale_pbp_offense, centers = 6, nstart = 25) # 6 cluster solution

# get cluster means (on the standardized scale)
aggregate(scale_pbp_offense, by = list(fit2$cluster), FUN = mean)
##   Group.1     OffPoss       FG2A        FG3A    FtPoints PtsUnassisted2s
## 1       1 -0.17933897 -1.3391055  1.16609561  0.15194415     -0.83966646
## 2       2  0.05575049 -0.6301804  0.89397399 -0.33345210     -0.28835466
## 3       3  1.40881000  1.9842318 -1.56834107  1.33896487      1.42752590
## 4       4 -0.02265965  0.1309756 -0.09520147 -0.77592683      0.14658876
## 5       5 -0.29145666  0.4985675 -0.91374351  0.87746029      0.02911422
## 6       6 -0.15516398  0.5995079 -0.62270926  0.08078479      0.32180504
##   PtsAssisted3s PtsUnassisted3s ShotQualityAvg.x PtsPutbacks ThreePtAssists
## 1     1.1801724       0.5343864        0.3142766  -0.6758365     1.34846958
## 2     0.5101673       1.2010570        0.1177746  -0.4778973     0.47162693
## 3    -1.5589662      -1.1191667       -1.8002683  -0.4396218    -0.70309481
## 4     0.2173920      -0.2417712        0.2713339  -0.1068590    -1.01413738
## 5    -1.1352865      -0.5443602        0.8176423   0.7906631    -0.18421500
## 6    -0.4707552      -0.6283502       -0.6412333   0.7851951    -0.05501636
##   AtRimAssists ShortMidRangeAssists Corner3Assists Avg3ptShotDistance
## 1   0.08293149          -1.04506939      1.6214609       -0.689327291
## 2  -0.06679966          -0.29431150     -0.2944604        1.073089193
## 3  -0.74370880           1.34368145     -0.8135606        0.027615043
## 4  -0.07503682          -0.04477176     -0.2512681       -0.315205765
## 5   0.88577303          -0.21520090      0.2346806       -0.197542323
## 6  -0.25737937           0.91300983     -0.6488778       -0.008419858
##      AtRimFGA ShortMidRangeFGA LongMidRangeFGA Corner3FGA    Arc3FGA
## 1  0.02709513      -0.88750832    -1.437959671  1.6717476  0.9297467
## 2 -0.45136947      -0.67813734     0.094181558 -0.3386467  0.8786837
## 3 -1.29197575       1.21267714     1.776398512 -0.8557200 -1.0778849
## 4  0.48839948       0.06595976     0.661788443 -0.2590110 -0.8169562
## 5  1.06594707      -0.22821716    -0.388328351  0.2917565 -0.2096058
## 6 -0.42098199       1.08869361    -0.001215618 -0.6615611 -0.2013249
##   Arc3Frequency
## 1     1.0279797
## 2     0.8615833
## 3    -1.1231131
## 4    -0.8453387
## 5    -0.2473869
## 6    -0.1927090
# Standardized data plus the cluster label of each team; the label column is
# named fit2.cluster, which the plotting code further down relies on
clust_pbp_offense <- data.frame(scale_pbp_offense, fit2.cluster = fit2$cluster)

# Draw the clusters on the first two principal components
# (adjust the display options below for the most readable graph)
clusplot(
  scale_pbp_offense,
  fit2$cluster,
  lines = 0,
  labels = 2,
  shade = TRUE,
  color = TRUE
)

K-Means Cluster Analysis 3

# K-Means Cluster Analysis for PBP rebounding, pace, and shot dist data
# kmeans() starts from randomly chosen centres. The seed makes the cluster
# assignment reproducible between runs, and nstart keeps the best of several
# starts instead of relying on a single random initialisation.
set.seed(2021)
fit3 <- kmeans(scale_pbp_defense, centers = 6, nstart = 25) # 6 cluster solution

# get cluster means (on the standardized scale)
aggregate(scale_pbp_defense, by = list(fit3$cluster), FUN = mean)
##   Group.1 SecondsPerPossOff SecondsPerPossDef SecondsExcludingORebsPerPossOff
## 1       1        -0.3282081         1.2038265                      -0.5061559
## 2       2        -0.3521976         0.3937434                      -0.3437141
## 3       3        -1.7754061         1.6716000                      -1.6799878
## 4       4         1.4501867        -1.1252650                       1.4635925
## 5       5        -0.7986030        -0.5193231                      -0.7834978
## 6       6         0.8916054        -0.3534332                       0.8895803
##   SecondsExcludingORebsPerPossDef     Blocks RecoveredBlocks     Steals
## 1                       1.3345300 -0.3232369     -0.15915016  0.5421403
## 2                       0.3727979 -0.3146294     -0.37625289  0.7075888
## 3                       1.5723189  1.8125690      1.77233710  0.8985093
## 4                      -1.0493668 -1.9404090     -1.91094987 -1.5368699
## 5                      -0.5565556  0.1938767     -0.02016582 -0.1178991
## 6                      -0.3358013  0.2255929      0.37265440 -0.4760875
##      Rebounds DefRebounds OffRebounds   SelfOReb
## 1  1.29573764 -0.01135785   1.9368062  2.3240228
## 2 -0.87726601 -0.68419780  -0.3967800 -0.3982174
## 3  0.31972876  1.66671444  -1.7287994 -0.8949506
## 4 -1.25841230 -0.81296165  -0.7918872 -1.1014776
## 5  1.00438361  0.90593149   0.2922360  0.2801172
## 6  0.02777182 -0.16467964   0.2588585  0.0849847
# Standardized data plus the cluster label of each team; the label column is
# named fit3.cluster, which the plotting code further down relies on
clust_pbp_defense <- data.frame(scale_pbp_defense, fit3.cluster = fit3$cluster)

# Draw the clusters on the first two principal components
# (adjust the display options below for the most readable graph)
clusplot(
  scale_pbp_defense,
  fit3$cluster,
  lines = 0,
  labels = 2,
  shade = TRUE,
  color = TRUE
)

Visualizing Clusters

# Attach each team's cluster label to the unscaled Basketball Reference table
bref_final[["Cluster"]] <- clust_adv_game[["fit.cluster"]]

# Scatter of standardized turnovers against pace; cluster membership is shown
# by both point shape and colour
ggplot(
  data = clust_adv_game,
  mapping = aes(
    x = TOV,
    y = Pace,
    shape = as.factor(fit.cluster),
    color = as.factor(fit.cluster)
  )
) +
  geom_point(size = 2.5)

# Interactive table grouped by cluster. Every statistic column gets the same
# definition: group rows show the cluster mean, formatted to one decimal place.
bref_stat_cols <- c(
  "FGA", "3PA", "FTA", "ORB", "DRB", "TRB", "AST",
  "STL", "BLK", "TOV", "ORtg", "DRtg", "Pace"
)
reactable(
  bref_final,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = setNames(
    lapply(bref_stat_cols, function(col_name) {
      colDef(aggregate = "mean", format = colFormat(digits = 1))
    }),
    bref_stat_cols
  )
)
# Attach each team's cluster label to the unscaled play-by-play offense table
pbpo_final[["Cluster"]] <- clust_pbp_offense[["fit2.cluster"]]

# Scatter of standardized offensive possessions against assisted three-point
# points; cluster membership is shown by both point shape and colour
ggplot(
  data = clust_pbp_offense,
  mapping = aes(
    x = OffPoss,
    y = PtsAssisted3s,
    shape = as.factor(fit2.cluster),
    color = as.factor(fit2.cluster)
  )
) +
  geom_point(size = 2.5)

# Interactive table grouped by cluster. Every statistic column gets the same
# definition: group rows show the cluster mean, formatted to one decimal place.
pbpo_stat_cols <- c(
  "OffPoss", "FG2A", "FG3A", "FtPoints", "PtsUnassisted2s",
  "PtsAssisted3s", "PtsUnassisted3s", "ShotQualityAvg.x", "PtsPutbacks",
  "ThreePtAssists", "AtRimAssists", "ShortMidRangeAssists",
  "Corner3Assists", "Avg3ptShotDistance", "AtRimFGA", "ShortMidRangeFGA",
  "LongMidRangeFGA", "Corner3FGA", "Arc3FGA", "Arc3Frequency"
)
reactable(
  pbpo_final,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = setNames(
    lapply(pbpo_stat_cols, function(col_name) {
      colDef(aggregate = "mean", format = colFormat(digits = 1))
    }),
    pbpo_stat_cols
  )
)
# Attach each team's cluster label to the unscaled play-by-play defense table
pbp_defense[["Cluster"]] <- clust_pbp_defense[["fit3.cluster"]]

# Scatter of standardized blocks against defensive seconds per possession;
# cluster membership is shown by both point shape and colour
ggplot(
  data = clust_pbp_defense,
  mapping = aes(
    x = Blocks,
    y = SecondsPerPossDef,
    shape = as.factor(fit3.cluster),
    color = as.factor(fit3.cluster)
  )
) +
  geom_point(size = 2.5)

# Interactive table grouped by cluster. Every statistic column gets the same
# definition: group rows show the cluster mean, formatted to one decimal place.
pbpd_stat_cols <- c(
  "SecondsPerPossOff", "SecondsPerPossDef",
  "SecondsExcludingORebsPerPossOff", "SecondsExcludingORebsPerPossDef",
  "Blocks", "RecoveredBlocks", "Steals", "Rebounds",
  "DefRebounds", "OffRebounds", "SelfOReb"
)
reactable(
  pbp_defense,
  searchable = TRUE,
  groupBy = "Cluster",
  columns = setNames(
    lapply(pbpd_stat_cols, function(col_name) {
      colDef(aggregate = "mean", format = colFormat(digits = 1))
    }),
    pbpd_stat_cols
  )
)